Introduction

This project analyzes Medicare provider data for the Tampa Bay region to understand service patterns, cost structures, and geopgraphic distributions. We utilize association rule mining, k-means clustering, linear regression, and geospatial visualization to explore relationships within the data. The primary data source is the CMS Medicare Provider Utilization and Payment Data Public Use File. Our goal is to identify key trends and factors influencing healthcare delivery and costs in this specific geographic area.

Setup: Loading Libraries and Data

Load Required Libraries

# Add your library below.
# List of required packages
required_packages <- c(
  "tidyverse",
  "GGally",
  "ggplot2",
  "sf",
  "dplyr",
  "readr",
  "arules",
  "leaflet",
  "tigris",
  "scales",
  "htmlwidgets",
  "jsonlite",
  "purrr",
  "stringr",
  "lme4",
  "broom.mixed",
  "tibble",
  "grepel"
)

# Install any packages that are not already installed
to_install <- setdiff(required_packages, rownames(installed.packages()))
if (length(to_install) > 0) {
  install.packages(to_install, dependencies = TRUE, repos = "https://cloud.r-project.org/")
}

# Load all required packages and check if successfully loaded
for (pkg in required_packages) {
  if (!require(pkg, character.only = TRUE)) {
    warning(sprintf("Package '%s' could not be loaded.", pkg))
  }
}

# Enable caching for tigris package
options(tigris_use_cache = TRUE)

# Tampa Bay ZIP codes
tampa_bay_zips <- as.numeric(c(
  "34423","34428","34429","34430","34431","34432","34433","34434","34436","34442","34445",
  "34446","34447","34448","34449","34450","34451","34452","34453","34461","34465","34601","34602","34604","34605",
  "34606","34607","34608","34609","34613","34614","33510","33511","33527","33534","33547","33548","33549","33556",
  "33558","33559","33563","33565","33566","33567","33569","33570","33572","33573","33578","33579","33584","33586",
  "33592","33594","33596","33602","33603","33604","33605","33606","33607","33609","33610","33611","33612","33613",
  "33614","33615","33616","33617","33618","33619","33620","33621","33624","33625","33626","33629","33634","33635",
  "33637","33647","34201","34202","34203","34205","34207","34208","34209","34210","34211","34212","34215","34216",
  "34217","34219","34221","34222","33523","33524","33525","33537","33539","33540","33541","33542","33543","33544",
  "33545","33576","34610","34637","34638","34639","34652","34653","34654","34655","34667","34668","34669","34673",
  "34674","34679","34680","34690","34691","33701","33702","33703","33704","33705","33706","33707","33708","33709",
  "33710","33711","33712","33713","33714","33715","33716","33755","33756","33759","33760","33761","33762","33763",
  "33764","33765","33767","33770","33771","33772","33773","33774","33776","33777","33778","33781","33782","33785",
  "34677","34681","34683","34684","34685","34688","34689","33801","33803","33805","33809","33810","33811","33812",
  "33813","33815","33823","33825","33830","33834","33837","33838","33839","33841","33843","33844","33847","33849",
  "33850","33851","33853","33859","33860","33867","33868","33870","33873","33875","33876","33877","33880","33881",
  "33882","33884","33896","33898","34231","34232","34233","34234","34235","34236","34237","34238","34239","34240",
  "34241","34242","34243","34251","34275","34276","34277","34285","34286","34287","34288","34289","34292","34293",
  "34295"
))

Cleaning the dataset: Sneh Patel, Steven Barden, Christopher Reddish, and Scott Young

# Assigning a datapath for the filtered_data.csv
filtered_data_file_path <- "Data/filtered_data.csv"

# Bypassing the cleaning step if the file exists from previous use
if (!file.exists(filtered_data_file_path)) {
  filtered_data <- read_csv("Data/Medicare_dataset.csv", show_col_types = FALSE) %>%
# Amending mistakenly attributed ZIP codes from original dataset
    mutate(Rndrng_Prvdr_Zip5 = ifelse(Rndrng_Prvdr_Zip5 %in% c("19107", "19144"), "33308", Rndrng_Prvdr_Zip5)) %>%
    filter(grepl("^\\d+$", Rndrng_Prvdr_Zip5)) %>%
    mutate(Rndrng_Prvdr_Zip5 = as.numeric(Rndrng_Prvdr_Zip5)) %>%
# Filter by Tampa Bay ZIP codes
    filter(Rndrng_Prvdr_Zip5 %in% tampa_bay_zips) %>%
# Rename columns to be more immediately understandable  
    rename(
      NPI = Rndrng_NPI,
      Provider_Last_Name = Rndrng_Prvdr_Last_Org_Name,
      Provider_First_Name = Rndrng_Prvdr_First_Name,
      Provider_MI = Rndrng_Prvdr_MI,
      Provider_Credentials = Rndrng_Prvdr_Crdntls,
      Entity_Code = Rndrng_Prvdr_Ent_Cd,
      Provider_Street1 = Rndrng_Prvdr_St1,
      Provider_Street2 = Rndrng_Prvdr_St2,
      Provider_City = Rndrng_Prvdr_City,
      Provider_State = Rndrng_Prvdr_State_Abrvtn,
      FIPS_Code = Rndrng_Prvdr_State_FIPS,
      Provider_Zip = Rndrng_Prvdr_Zip5,
      RUCA_Code = Rndrng_Prvdr_RUCA,
      RUCA_Description = Rndrng_Prvdr_RUCA_Desc,
      Provider_Country = Rndrng_Prvdr_Cntry,
      Provider_Type = Rndrng_Prvdr_Type,
      Medicare_Participation = Rndrng_Prvdr_Mdcr_Prtcptg_Ind,
      HCPCS_Code = HCPCS_Cd,
      HCPCS_Description = HCPCS_Desc,
      HCPCS_Drug_Indicator = HCPCS_Drug_Ind,
      Place_Of_Service = Place_Of_Srvc,
      Total_Beneficiaries = Tot_Benes,
      Total_Services = Tot_Srvcs,
      Total_Beneficiary_Service_Days = Tot_Bene_Day_Srvcs,
      Avg_Submitted_Charge = Avg_Sbmtd_Chrg,
      Avg_Medicare_Allowed = Avg_Mdcr_Alowd_Amt,
      Avg_Medicare_Payment = Avg_Mdcr_Pymt_Amt,
      Avg_Medicare_Standardized_Amt = Avg_Mdcr_Stdzd_Amt
    ) %>%
# Removing currency symbol from the following columns      
    mutate(
      across(
        .cols = c(Avg_Medicare_Payment, Avg_Medicare_Allowed, Avg_Submitted_Charge, Avg_Medicare_Standardized_Amt),
        .fns = ~ gsub("\\$", "", as.character(.))
        )
      ) %>%
    select(-Provider_First_Name, -Provider_MI)

# Save the filtered data to a new CSV in the 'data' folder
write.csv(filtered_data, "Data/filtered_data.csv", row.names = FALSE)

# Print confirmation
print(paste("Filtered data saved to: Data/filtered_data.csv"))
} else{
  print(paste("File '", filtered_data_file_path, "' already exists. Skipping creation."))
}
## [1] "File ' Data/filtered_data.csv ' already exists. Skipping creation."

Load Data Once If Clean Set Exists

Read in the filtered .csv file for easier use without having to load the initial dataset.

#Load the filtered data ONCE after cleaning and take care of problems of FIPS code error and zip map for k-mean map
filtered_data <- read_csv("Data/filtered_data.csv",
                          col_types = cols(FIPS_Code = col_character())) %>%
  mutate(FIPS_Code = as.numeric(gsub("[^0-9]", "", FIPS_Code)), Provider_Zip = sprintf("%05s", as.character(Provider_Zip)))

Association Rule Mining: Christopher Reddish, Scott Young

📘 Introduction

This project detects potentially suspicious billing behavior among Medicare providers in the Tampa Bay area. Using association rule mining and a composite scoring model, it flags providers who may warrant audit review.

🔹 Step 1: Clean and Prepare Data

filtered_data <- read.csv("Data/filtered_data.csv")

clean_df <- filtered_data %>%
  mutate(
    Avg_Medicare_Payment = as.numeric(Avg_Medicare_Payment)
  ) %>%
  filter(
    !is.na(Provider_Type),
    !is.na(HCPCS_Code),
    !is.na(NPI),
    as.numeric(Avg_Medicare_Payment) >= 10
  ) %>%
  mutate(
    NPI = as.character(NPI),
    HCPCS_Code = as.character(HCPCS_Code),
    Provider_Type = as.factor(trimws(Provider_Type))
  ) %>%
  select(NPI, HCPCS_Code, Avg_Medicare_Payment, Provider_Type, Provider_Last_Name)

🔹 Step 2: Generate Association Rules by Specialty

# Initialize lists to store results
all_rule_matches <- tibble()      # For NPI-rule match scoring in Step 3
provider_summary_list <- list()   # For rule-based provider summaries

# Step 2A: Split the dataset by specialty (Provider_Type)
specialty_groups <- clean_df %>%
  group_by(Provider_Type) %>%
  group_split()

# Step 2B: Loop through each specialty subset
for (spec_df in specialty_groups) {
  spec_name <- as.character(unique(spec_df$Provider_Type))
  if (nrow(spec_df) < 2) next

  # Step 2C: Convert each NPI into a transaction of billed HCPCS codes
  provider_tx <- spec_df %>%
    group_by(NPI) %>%
    summarise(items = list(unique(HCPCS_Code)), .groups = "drop")

  tx_list <- lapply(split(provider_tx$items, provider_tx$NPI), unlist)
  if (length(tx_list) < 2) next

  trans <- as(tx_list, "transactions")

  # Step 2D: Generate association rules for this specialty
  rules <- apriori(trans, parameter = list(supp = 0.02, conf = 0.7, maxlen = 2))
  if (length(rules) == 0) next

  # Step 2E: Filter rules by lift and convert to data frame
  rules_df_raw <- as(rules, "data.frame")
  rules_df_filtered <- rules_df_raw %>% filter(lift >= 2)
  if (nrow(rules_df_filtered) == 0) next

  # Add readable labels and parse rule structure for each rule
  rules_df_filtered$lhs <- labels(lhs(rules))[as.integer(rownames(rules_df_filtered))]
  rules_df_filtered$rhs <- labels(rhs(rules))[as.integer(rownames(rules_df_filtered))]
  rules_df_filtered <- rules_df_filtered %>%
    mutate(
      rule_label = paste(lhs, "=>", rhs),
      lhs_items = strsplit(gsub("[{}]", "", lhs), ","),
      rhs_items = strsplit(gsub("[{}]", "", rhs), ","),
      all_items = map2(lhs_items, rhs_items, ~ unique(trimws(c(.x, .y))))
    )

  # Step 2F: Match rules to providers
  provider_rule_map <- list()
  npi_rule_log <- tibble()

  for (npi in names(tx_list)) {
    provider_items <- tx_list[[npi]]

    matched <- rules_df_filtered %>%
      filter(map_lgl(all_items, ~ all(.x %in% provider_items)))

    if (nrow(matched) > 0) {
      provider_rule_map[[npi]] <- matched$rule_label

      npi_rule_log <- bind_rows(npi_rule_log, tibble(
        NPI = npi,
        Rule_Label = matched$rule_label,
        Lift = matched$lift,
        Confidence = matched$confidence
      ))
    }
  }

  # Step 2G: Create provider summary stats for rule match counts
  if (length(provider_rule_map) > 0) {
    provider_summary <- tibble(
      Specialty = spec_name,
      NPI = names(provider_rule_map),
      Total_Rules = sapply(provider_rule_map, function(x) length(unique(x)))
    )

    hcpcs_diversity <- spec_df %>%
      group_by(NPI) %>%
      summarise(Unique_Codes = n_distinct(HCPCS_Code), .groups = "drop")

    provider_summary <- provider_summary %>%
      left_join(hcpcs_diversity, by = "NPI") %>%
      mutate(
        Rules_Per_Code = round(Total_Rules / Unique_Codes, 2),
        Unique_Codes_Per_Rule = round(Unique_Codes / Total_Rules, 4)
      )

    provider_summary_list[[spec_name]] <- provider_summary
    all_rule_matches <- bind_rows(all_rule_matches, npi_rule_log)
  }
}

# Export objects for Step 3
final_provider_results <- bind_rows(provider_summary_list)
npi_rule_scores <- all_rule_matches %>%
  group_by(NPI) %>%
  summarise(
    mean_lift = mean(Lift, na.rm = TRUE),
    mean_confidence = mean(Confidence, na.rm = TRUE),
    .groups = "drop"
  )

🔹 Step 3: Step 3A: Merge all provider summaries across specialties into one dataset

# Step 3A: Merge rule-based stats into the full cleaned Medicare dataset
merged_df <- clean_df %>%
  left_join(final_provider_results, by = "NPI")

# Step 3B: Append average lift and confidence per provider from rule matches
scored_providers <- merged_df %>%
  left_join(npi_rule_scores, by = "NPI")

# Step 3C: Z-score standardization for Rules_Per_Code within each specialty
scored_providers <- scored_providers %>%
  group_by(Provider_Type) %>%
  mutate(Z_Score_RPC = as.numeric(scale(Rules_Per_Code))) %>%
  ungroup()

# Step 3D: Compute a composite audit score
# This blends:
#   - billing repetition
#   - rule lift
#   - rule confidence
#   - average Medicare payout
#   - relative specialty behavior
scored_providers <- scored_providers %>%
  mutate(
    Composite_Audit_Score = round(
      (coalesce(Rules_Per_Code, 0) * 1.5) +
      (coalesce(mean_lift, 0) * 2.0) +
      (coalesce(mean_confidence, 0) * 1.0) +
      (coalesce(as.numeric(Avg_Medicare_Payment, 0) / 100)) +
      coalesce(Z_Score_RPC, 0),
      2
    )
  )

# Step 3E: Append sample size per provider for audit traceability
scored_providers <- scored_providers %>%
  group_by(NPI) %>%
  mutate(Sample_Size = n()) %>%
  ungroup()

🔹 Step 4: Risk Stratification & Visualization

# Step 4A: Flag audit risk levels using percentile thresholds
# ------------------------------------------------------------
# We classify providers within each specialty based on Composite Score:
# - Top 2% = Extreme High Risk
# - 92–98% = High Risk
# - 80–92% = Moderate Risk
# - Bottom 80% = Low/No Risk

scored_flagged <- scored_providers %>%
  group_by(Provider_Type) %>%
  mutate(
    p80 = quantile(Composite_Audit_Score, 0.80, na.rm = TRUE),
    p92 = quantile(Composite_Audit_Score, 0.92, na.rm = TRUE),
    p98 = quantile(Composite_Audit_Score, 0.98, na.rm = TRUE),
    Audit_Flag = case_when(
      Composite_Audit_Score >= p98 ~ "Extreme High Risk (98–100%)",
      Composite_Audit_Score >= p92 ~ "High Risk (92–98%)",
      Composite_Audit_Score >= p80 ~ "Moderate Risk (80–92%)",
      TRUE                         ~ "Low/No Risk (0–80%)"
    )
  ) %>%
  ungroup()

# Step 4B: Filter to valid (non-NA, >0) composite scores
valid_scores_df <- scored_flagged %>%
  filter(!is.na(Composite_Audit_Score), Composite_Audit_Score > 0)

# Step 4C: Manually set your "Top 15 Most Interesting" specialties
top15_specialties <- c(
  "Anesthesiology", "Cardiology", "Dermatology", "Diagnostic Radiology",
  "Emergency Medicine", "Family Practice", "Gastroenterology", "General Surgery",
  "Internal Medicine", "Interventional Cardiology", "Nurse Practitioner",
  "Orthopedic Surgery", "Physician Assistant", "Podiatry", "Vascular Surgery"
)

# Step 4D: Filter only the specialties of interest and cap extreme outliers
volume_filtered_df <- valid_scores_df %>%
  filter(
    Provider_Type %in% top15_specialties,
    Composite_Audit_Score <= quantile(Composite_Audit_Score, 0.99, na.rm = TRUE),
    Avg_Medicare_Payment <= 1000
  )

# Step 4E: Generate plot with fixed x and y axis, no log scale
composite_plot <- ggplot(volume_filtered_df, aes(
  x = Avg_Medicare_Payment,
  y = Composite_Audit_Score,
  color = Audit_Flag
)) +
  geom_point(alpha = 0.7) +
  facet_wrap(~ Provider_Type, scales = "fixed", nrow = 5) +
  scale_color_manual(
    values = c(
      "Extreme High Risk (98–100%)" = "purple",
      "High Risk (92–98%)" = "red",
      "Moderate Risk (80–92%)" = "gold",
      "Low/No Risk (0–80%)" = "green"
    )
  ) +
  scale_x_continuous(
    labels = scales::dollar_format(),
    breaks = c(0, 250, 500, 750, 1000),
    limits = c(0, 1000)
  ) +
  scale_y_continuous(
    limits = c(0, 85),
    breaks = seq(0, 80, by = 20)
  ) +
  theme_minimal(base_size = 13) +
  theme(strip.text = element_text(size = 10)) +
  labs(
    title = "Composite Audit Score vs. Medicare Payment",
    subtitle = "Top 15 Most Interesting Specialties – Risk Segmentation at 80/92/98 Percentiles",
    x = "Average Medicare Payment ($)",
    y = "Composite Audit Score",
    color = "Audit Risk Level"
  )

# Step 4F: Save output as PNG
ggsave("Top15_Composite_vs_Payment_BySpecialty_FIXEDAXIS.png",
       plot = composite_plot, dpi = 300, width = 14, height = 9)

# Step 4G: Render in output
composite_plot

🔹 Step 5: Top Providers Report

# Step 5A: Join Provider_Last_Name from filtered_data, and rename it cleanly
scored_named <- scored_providers %>%
  left_join(
    filtered_data %>%
      mutate(NPI = as.character(NPI)) %>%
      select(NPI, Provider_Last_Name),
    by = "NPI"
  ) %>%
  rename(Provider_Last_Name = Provider_Last_Name.y)  # in case it became .y

# Step 5B: Top 50 providers overall
top50_overall <- scored_named %>%
  distinct(NPI, .keep_all = TRUE) %>%
  arrange(desc(Composite_Audit_Score)) %>%
  slice_head(n = 50)

# Display Top 50 Table
knitr::kable(
  top50_overall %>%
    select(Provider_Last_Name, NPI, Provider_Type, Composite_Audit_Score),
  caption = "Top 50 Providers Overall by Composite Audit Score"
)
Top 50 Providers Overall by Composite Audit Score
Provider_Last_Name NPI Provider_Type Composite_Audit_Score
Quest Diagnostics Clinical Laboratories Inc 1891731626 Clinical Laboratory 435.67
Laboratory Corporation Of America 1255314704 Clinical Laboratory 395.91
Vanvliet 1841372968 Critical Care (Intensivists) 124.67
Pine Brook Pharmacy Llc 1558596932 Clinical Laboratory 97.41
Rojas 1134383060 Pathology 96.04
Mittal 1992742084 Pathology 96.04
Mathew 1598296097 Pathology 96.02
Martens 1144683467 Pathology 95.23
Zhao 1578603767 Pathology 95.21
Patel 1710368501 Pathology 95.07
Lee 1871628529 Pathology 94.93
Miller 1922111699 Pathology 94.85
Robens 1619116407 Pathology 94.82
Mcmullen 1740203389 Pathology 94.82
Sher 1952757411 Pathology 94.78
Ficarra 1326220062 Pathology 94.76
Xu 1770743593 Pathology 94.73
Lewis 1326070962 Pathology 94.68
Barton 1467431247 Pathology 94.68
Lancia 1700959590 Pathology 94.51
Mcpherson 1538184320 Pathology 94.48
Migliozzi 1013064021 Pathology 94.37
Lastarria 1437241304 Pathology 94.29
Angell 1205870870 Thoracic Surgery 92.30
Vogelbaum 1598732380 Neurosurgery 90.57
Etame 1639292287 Neurosurgery 89.56
Haydon 1669677514 Neurosurgery 89.30
Carvallo 1053416412 Nephrology 87.40
Badillo 1871588350 Nephrology 84.57
Abdel Rahman 1134383615 Nephrology 84.54
Rizzo 1801870407 Nephrology 84.33
Toledo-Nieves 1750838702 Nephrology 83.78
Watson Clinic Llp 1558409730 Clinical Laboratory 83.12
Elite Bio Reference Laboratory Llc 1275292294 Clinical Laboratory 82.95
Moodey 1124231774 Critical Care (Intensivists) 81.93
Hematopathology Associates Llc 1164523478 Clinical Laboratory 81.24
International Medical Laboratory Inc 1376518241 Clinical Laboratory 79.01
Angirekula 1710942206 Pain Management 77.26
Campbell 1548377906 Critical Care (Intensivists) 76.90
Baldonado 1992221444 Thoracic Surgery 76.84
Boyle 1003938366 Pathology 76.12
Bossler 1679554612 Pathology 76.12
Qin 1881787877 Pathology 76.12
Saeed-Vafa 1932433570 Pathology 75.87
Bowes Imaging Center Llc. 1316985658 Independent Diagnostic Testing Facility (IDTF) 75.20
Coppola 1730119454 Pathology 73.76
Williams 1598744393 Pathology 73.57
Balsera 1598773376 Nephrology 73.39
Lubin 1508834706 Pain Management 73.04
Bovee 1427346683 Ophthalmology 72.91
# Step 5C: Get Top 3 Providers by Specialty without repeating NPIs
top3_by_specialty <- scored_named %>%
  filter(!is.na(Composite_Audit_Score)) %>%
  group_by(Provider_Type, NPI) %>%
  slice_max(order_by = Composite_Audit_Score, n = 1) %>%  # get max score per NPI
  ungroup() %>%
  distinct(NPI, .keep_all = TRUE) %>%                     # remove any duplicate NPIs
  group_by(Provider_Type) %>%
  slice_max(order_by = Composite_Audit_Score, n = 3) %>%  # now take top 3 per specialty
  ungroup()

# Display Top 3 Per Specialty Table
knitr::kable(
  top3_by_specialty %>%
    select(Provider_Type, Provider_Last_Name, NPI, Composite_Audit_Score),
  caption = "Top 3 Providers by Composite Score in Each Specialty"
)
Top 3 Providers by Composite Score in Each Specialty
Provider_Type Provider_Last_Name NPI Composite_Audit_Score
Addiction Medicine Amen 1831300995 0.98
Advanced Heart Failure and Transplant Cardiology Kumar 1538267661 16.88
Advanced Heart Failure and Transplant Cardiology Oliveira 1699879098 14.69
Advanced Heart Failure and Transplant Cardiology Arroyo 1811965650 12.09
All Other Suppliers Brashears Vital Care Corp 1710912662 0.70
Allergy/ Immunology Cho 1093917304 26.84
Allergy/ Immunology Sher 1386603108 23.84
Allergy/ Immunology Alshaar 1669411740 17.88
Ambulance Service Provider City Of Temple Terrace 1134177389 16.41
Ambulance Service Provider Affordable Transport Inc. 1225279318 15.24
Ambulance Service Provider Americare Ambulance Service Inc 1073543021 15.21
Ambulatory Surgical Center Coastal Medical Center Llc 1538231956 41.57
Ambulatory Surgical Center Citrus Urology Center, Inc. 1821068289 37.57
Ambulatory Surgical Center Riverwalk Ambulatory Surgery Center Llc 1588839609 34.62
Anesthesiology Beebe 1831395896 55.49
Anesthesiology Fitzpatrick 1750764148 54.11
Anesthesiology Kaiafas 1386676773 53.13
Anesthesiology Assistant Pierre 1275925109 26.38
Anesthesiology Assistant Hess 1013903046 24.75
Anesthesiology Assistant Baltodano 1972157824 22.76
Audiologist Malkiewicz 1922475805 37.52
Audiologist Chen 1053048991 33.71
Audiologist Schenk 1154592293 33.69
Cardiac Surgery Ezeldin 1639129133 69.60
Cardiac Surgery Vesco 1487641098 45.96
Cardiac Surgery Sanabria 1386698041 45.52
Cardiology Carrillo 1821092370 44.95
Cardiology Makati 1467541953 44.37
Cardiology De Azevedo Filho 1619467529 43.95
Centralized Flu Walgreen Co 1396750584 13.30
Centralized Flu Walgreen Co 1174538474 13.28
Centralized Flu Walgreen Co 1558376731 13.27
Certified Clinical Nurse Specialist Allspaw 1154380921 1.61
Certified Nurse Midwife Mccormick 1558366336 0.98
Certified Nurse Midwife Harrington 1366430449 0.93
Certified Nurse Midwife Dibartolo 1952812042 0.65
Certified Registered Nurse Anesthetist (CRNA) Alea 1881662187 40.67
Certified Registered Nurse Anesthetist (CRNA) Alvarodiaz 1427199165 40.55
Certified Registered Nurse Anesthetist (CRNA) Horowitz 1891893236 40.53
Chiropractic Bretz 1124128319 0.41
Chiropractic Paisley 1730616848 0.41
Chiropractic Tellier 1750452868 0.41
Clinical Cardiac Electrophysiology Perzanowski-Obregon 1649229758 66.63
Clinical Cardiac Electrophysiology Yamamura 1083603989 64.86
Clinical Cardiac Electrophysiology Cardona 1588626410 62.52
Clinical Laboratory Quest Diagnostics Clinical Laboratories Inc 1891731626 439.31
Clinical Laboratory Laboratory Corporation Of America 1255314704 402.49
Clinical Laboratory Pine Brook Pharmacy Llc 1558596932 101.32
Colorectal Surgery (Proctology) Frattini 1831367424 53.34
Colorectal Surgery (Proctology) Shore 1235138983 38.26
Colorectal Surgery (Proctology) Honer 1801864079 36.12
Critical Care (Intensivists) Vanvliet 1841372968 132.50
Critical Care (Intensivists) Moodey 1124231774 82.51
Critical Care (Intensivists) Campbell 1548377906 77.46
Dermatology Mavropoulos 1063737682 39.93
Dermatology Miller 1023007242 31.42
Dermatology Esguerra 1205825486 29.24
Diagnostic Radiology Dawson 1033537725 41.82
Diagnostic Radiology Murtagh 1538142567 41.30
Diagnostic Radiology Clark 1902864341 41.22
Emergency Medicine Regan 1891719050 52.37
Emergency Medicine Odome 1245641240 52.10
Emergency Medicine Ratcliffe 1952339103 50.25
Endocrinology Antunes 1720032253 35.90
Endocrinology Piotrowska 1205151453 34.27
Endocrinology Pinero-Pilona 1720082068 33.52
Family Practice Davson Sterling 1861709610 29.96
Family Practice Reddy 1003377268 29.33
Family Practice Khurana 1952357816 29.29
Gastroenterology George 1467804179 49.90
Gastroenterology Khan 1013928282 49.23
Gastroenterology Castineira 1760830731 48.37
General Practice Hammarquist 1033650551 61.19
General Practice Rudelli 1184260283 61.03
General Practice Perkins 1700229838 47.38
General Surgery Mitchell 1033166277 77.89
General Surgery Parrack 1134369002 77.42
General Surgery Ruan 1285711275 77.24
Geriatric Medicine Kienle 1740398676 28.90
Geriatric Medicine Cinelli 1598793861 25.14
Geriatric Medicine Cherukuri 1750375267 20.23
Gynecological Oncology Hahn 1457604738 38.59
Gynecological Oncology South 1336368547 31.40
Gynecological Oncology Stine 1720310089 31.20
Hand Surgery Chan 1114129236 38.52
Hand Surgery Klein 1295773059 35.30
Hand Surgery Miller 1154397297 34.78
Hematology Bergier 1528098720 50.41
Hematology Giglia 1417004201 18.99
Hematology Muhammad 1942242938 18.24
Hematology-Oncology Maun 1518940246 22.85
Hematology-Oncology Mulaparthi 1427048537 20.60
Hematology-Oncology Harandi 1003893199 20.40
Hematology-Oncology Gonter 1295726933 20.40
Hematopoietic Cell Transplantation and Cellular Therapy Hansen 1881958270 8.39
Hematopoietic Cell Transplantation and Cellular Therapy Jain 1982139655 8.21
Hematopoietic Cell Transplantation and Cellular Therapy Mishra 1225225865 8.19
Home Infusion Therapy Services Coram Healthcare Corporation Of Florida 1417907627 0.29
Hospice and Palliative Care Suggs 1497710172 66.20
Hospice and Palliative Care Uzabel 1427019918 58.77
Hospice and Palliative Care Akkineni 1205099744 57.31
Hospitalist Patel 1427126036 61.49
Hospitalist Garcia 1639188618 61.49
Hospitalist Chandler 1487041620 61.47
Independent Diagnostic Testing Facility (IDTF) Bowes Imaging Center Llc. 1316985658 77.65
Independent Diagnostic Testing Facility (IDTF) Diagnostic Medical Testing Inc 1083672208 63.97
Independent Diagnostic Testing Facility (IDTF) Bowes Imaging Center, Llc 1104198258 43.01
Infectious Disease Schreibman 1609877190 35.06
Infectious Disease Cancel 1740246644 34.90
Infectious Disease Nguyen 1811159262 23.63
Internal Medicine Kadari 1760046171 44.90
Internal Medicine Vyas 1447690987 44.25
Internal Medicine Kanapathippillai 1952530420 43.87
Interventional Cardiology Sivasankaran 1265520043 26.91
Interventional Cardiology Khant 1811959307 25.53
Interventional Cardiology Srivastava 1932209608 24.10
Interventional Pain Management Jassal 1255625513 32.64
Interventional Pain Management Ramos 1831150630 32.34
Interventional Pain Management Samlaska 1548291172 30.62
Interventional Radiology Nieves-Cruz 1174703375 56.46
Interventional Radiology Niedzwiecki 1578632147 47.87
Interventional Radiology Raymond 1770856999 45.22
Licensed Clinical Social Worker Mcdaniel 1144729427 1.07
Licensed Clinical Social Worker Thekkethottiyil 1023376894 1.06
Licensed Clinical Social Worker Rosendale 1366747115 1.06
Licensed Clinical Social Worker Cunningham 1457494270 1.06
Licensed Clinical Social Worker Westfield 1598070187 1.06
Licensed Clinical Social Worker Goodhand 1902128879 1.06
Licensed Clinical Social Worker Eakes 1912477712 1.06
Licensed Clinical Social Worker Yewell 1922517556 1.06
Licensed Clinical Social Worker Johnston 1942469648 1.06
Mammography Center Adventhealth West Florida Imaging Inc 1316581234 1.25
Mass Immunizer Roster Biller Jahd Inc 1275070716 18.37
Mass Immunizer Roster Biller Samson Merger Sub, Llc 1932521358 18.37
Mass Immunizer Roster Biller Samson Merger Sub, Llc 1700208121 18.35
Maxillofacial Surgery Chuong 1558369678 15.34
Maxillofacial Surgery Hothersall 1902912355 6.05
Medical Oncology Audeh 1104817063 47.45
Medical Oncology Berry 1669463378 42.28
Medical Oncology Lunin 1003807645 41.86
Micrographic Dermatologic Surgery Worley 1457823809 26.52
Micrographic Dermatologic Surgery Epstein 1356588974 26.47
Micrographic Dermatologic Surgery Ewanowski 1780880476 24.19
Nephrology Carvallo 1053416412 87.40
Nephrology Badillo 1871588350 84.57
Nephrology Abdel Rahman 1134383615 84.54
Neurology Aung-Din 1215987912 75.46
Neurology Reddy 1952368433 70.66
Neurology Malka 1235176322 64.77
Neuropsychiatry Bazargan 1265461941 1.35
Neurosurgery Vogelbaum 1598732380 96.63
Neurosurgery Haydon 1669677514 96.25
Neurosurgery Etame 1639292287 96.00
Nuclear Medicine Carter 1659582286 21.00
Nuclear Medicine Miliziano 1336154608 20.71
Nuclear Medicine Taher 1386829984 18.87
Nurse Practitioner Rexroth 1992921977 63.86
Nurse Practitioner Hawkins 1427619329 63.66
Nurse Practitioner Hazelwood 1851394274 63.46
Obstetrics & Gynecology Cox 1023021789 41.23
Obstetrics & Gynecology Barreiro 1093865743 41.23
Obstetrics & Gynecology Wahba 1124178116 41.23
Obstetrics & Gynecology Mersch 1154858322 41.23
Obstetrics & Gynecology Michael 1508155904 41.23
Obstetrics & Gynecology Watkins 1528054509 41.23
Obstetrics & Gynecology Hardy-Hunter 1598940074 41.23
Obstetrics & Gynecology Calderon 1669478939 41.23
Obstetrics & Gynecology Mcneill 1740548700 41.23
Obstetrics & Gynecology Miller 1790771731 41.23
Obstetrics & Gynecology Curtis 1821402868 41.23
Obstetrics & Gynecology Gomez 1851522825 41.23
Obstetrics & Gynecology Vonthron 1881680825 41.23
Obstetrics & Gynecology Weiss 1922094986 41.23
Occupational Therapist in Private Practice Martin 1285759555 15.75
Occupational Therapist in Private Practice Koesling 1295388635 15.73
Occupational Therapist in Private Practice Finley 1730707035 15.72
Ophthalmology Bovee 1427346683 77.31
Ophthalmology Martino 1073702031 68.45
Ophthalmology Levitt 1720074974 63.51
Opioid Treatment Program Operation Par Inc 1477611903 2.08
Opioid Treatment Program Maps Sarasota 1477810638 2.08
Opioid Treatment Program Operation Par Inc 1639237001 2.08
Opioid Treatment Program Operation Par Inc 1972661411 2.08
Optometry Johnson 1114925773 11.66
Optometry Buffano 1275765265 11.15
Optometry Sorkin 1902939010 10.71
Oral Surgery (Dentist only) Moore 1164564951 17.42
Oral Surgery (Dentist only) Vorwald 1689961385 13.41
Oral Surgery (Dentist only) Banachowski 1033484944 7.14
Orthopedic Surgery Cronin 1821479759 63.01
Orthopedic Surgery Hess 1578568291 60.50
Orthopedic Surgery Klima 1518103316 57.50
Osteopathic Manipulative Medicine Von Lindeman 1629146816 35.99
Osteopathic Manipulative Medicine Jones 1184129983 29.75
Osteopathic Manipulative Medicine Miranda 1225202575 29.48
Otolaryngology Parasher 1962795781 27.32
Otolaryngology Pfaff 1952664609 26.86
Otolaryngology Byers 1710935911 25.93
Pain Management Angirekula 1710942206 78.86
Pain Management Latif 1568434868 75.47
Pain Management Lubin 1508834706 73.04
Pathology Migliozzi 1013064021 98.56
Pathology Rojas 1134383060 96.04
Pathology Mittal 1992742084 96.04
Pediatric Medicine Rucker 1194721522 35.30
Pediatric Medicine Patel 1770005902 34.21
Pediatric Medicine Mesghali Morales 1336362706 31.03
Pharmacy Maa Bhawani Inc 1003104779 20.16
Pharmacy Balls Rexall Drugs Inc 1801908736 18.55
Pharmacy Maahi Llc 1184024788 18.08
Physical Medicine and Rehabilitation Genato 1366468282 42.94
Physical Medicine and Rehabilitation Razvi 1679859532 41.98
Physical Medicine and Rehabilitation Lockett 1356410443 33.95
Physical Therapist in Private Practice Hauskey 1578997052 1.73
Physical Therapist in Private Practice Thomas 1932457330 1.06
Physical Therapist in Private Practice De La Torre 1336516772 0.98
Physician Assistant Elomina 1831811090 72.75
Physician Assistant Daignault 1528652120 72.32
Physician Assistant Radhakrishnan 1265150890 72.31
Plastic and Reconstructive Surgery Shulman 1972690329 60.64
Plastic and Reconstructive Surgery Dayicioglu 1164674081 58.15
Plastic and Reconstructive Surgery Smith 1891707915 45.95
Podiatry Marsh 1093722548 18.45
Podiatry Williams 1477530905 17.64
Podiatry Zelna 1376885517 16.55
Portable X-Ray Supplier Radiance Radiology Inc 1134459720 13.16
Portable X-Ray Supplier Florida Imaging Specialists, Pa 1790328185 9.41
Portable X-Ray Supplier Professional Portable Radiologic Services Inc. 1033780051 9.02
Preventive Medicine Senatorov 1679881809 38.96
Preventive Medicine Patel 1750720025 24.10
Preventive Medicine Deveau 1528301132 19.89
Psychiatry Qureshi 1497166656 63.87
Psychiatry Shahid 1598184145 63.83
Psychiatry Patel 1205216058 62.59
Psychologist, Clinical Cortman 1629160775 22.90
Psychologist, Clinical Todoroff 1033129804 19.18
Psychologist, Clinical Simpson 1194779470 18.70
Pulmonary Disease Plum 1770902835 44.72
Pulmonary Disease Miller 1790050979 33.69
Pulmonary Disease Cheing 1750701652 32.69
Radiation Oncology Yu 1740386358 45.15
Radiation Oncology Oliver 1053707521 44.96
Radiation Oncology Scott 1679734370 36.06
Registered Dietitian or Nutrition Professional Grillo 1538678347 0.92
Registered Dietitian or Nutrition Professional Solberger 1740882141 0.35
Registered Dietitian or Nutrition Professional Riley 1033153788 0.31
Registered Dietitian or Nutrition Professional Timmons 1265593644 0.31
Registered Dietitian or Nutrition Professional Harbison 1407832462 0.31
Registered Dietitian or Nutrition Professional Dominguez 1568128676 0.31
Registered Dietitian or Nutrition Professional Figueroa 1629591672 0.31
Registered Dietitian or Nutrition Professional Ulm 1780915116 0.31
Registered Dietitian or Nutrition Professional Gilpin 1871163949 0.31
Rheumatology Johnston 1932332384 43.20
Rheumatology Drucker 1790760783 41.42
Rheumatology Zito 1407992597 34.47
Sleep Medicine Agarwal 1932497773 16.46
Sleep Medicine Anderson 1699783845 11.23
Sleep Medicine Guzman 1881198547 9.05
Speech Language Pathologist Mccord 1194072017 37.84
Speech Language Pathologist Thornberry 1831822717 37.84
Speech Language Pathologist Rejonis 1720361595 37.82
Sports Medicine Yi 1235367483 66.23
Sports Medicine Gorman 1659507507 59.98
Sports Medicine Dhani 1366949935 48.54
Surgical Oncology Moskal 1619949831 36.40
Surgical Oncology Meredith 1659329217 35.87
Surgical Oncology Laronga 1659398402 31.37
Thoracic Surgery Angell 1205870870 92.69
Thoracic Surgery Baldonado 1992221444 84.54
Thoracic Surgery Lambert 1558369454 73.18
Undefined Physician type Kennon 1053675868 66.37
Undefined Physician type Sweeney 1215103775 17.58
Undefined Physician type Hunt 1871579946 13.70
Undersea and Hyperbaric Medicine Latimer-Pierson 1740431469 10.88
Undersea and Hyperbaric Medicine Valdes 1912958588 6.29
Urology Graves 1164684239 29.27
Urology Everett 1780039743 23.15
Urology Szell 1124316088 23.05
Vascular Surgery Nedurian 1417942681 53.24
Vascular Surgery Davis 1023335601 48.17
Vascular Surgery Purcell 1144587841 46.27

📄 Summary of Association Rule Findings

K-Means Clustering: Christopher Reddish, Sneh Patel

📘 Introduction

The primary goal of this analysis is to analyze Medicare provider data using k-means clustering, assign meaningful cluster labels to providers based on cost and service volume, and visualize the geographic distribution of these clusters across ZIP codes in Florida using an interactive map.

🔹 Step 1: Prepare and Scale Data for Clustering

# ---- Step 3: K-Means Clustering ----
# K-Means Clustering
df_raw <- filtered_data %>%
  mutate(
    across(
      .cols = c(Total_Services, Avg_Submitted_Charge, Avg_Medicare_Payment),
      .fns = ~as.numeric(as.character(.))
    )
  ) %>%
  select(NPI, Total_Services, Avg_Submitted_Charge, Avg_Medicare_Payment) %>%
  na.omit()

stopifnot(all(sapply(df_raw, is.numeric)))

df_scaled <- scale(df_raw)

🔹 Step 2: Perform K-Means Clustering

k <- min(4, nrow(unique(df_scaled)))
if (k < 2) stop("Not enough data to form clusters.")

set.seed(123)
clusters <- kmeans(df_scaled, centers = k, nstart = 25)

🔹 Step 3: Assign Cluster Labels

cluster_labels <- c(
  "1" = "Moderate Cost, High Volume",
  "2" = "Typical Providers",
  "3" = "High Cost, Low Volume",
  "4" = "Low Cost, Very High Volume"
)

results <- df_raw %>%
  mutate(
    Cluster = clusters$cluster,
    Cluster_Label = cluster_labels[as.character(Cluster)]
  )

🔹 Step 4: Prepare Geographic and Provider Locations

# Mapping
zcta_shapes <- tigris::zctas(cb = TRUE, year = 2020) %>%
  rename(Zip = ZCTA5CE20) %>%
  mutate(Zip = sprintf("%05s", as.character(Zip)))

# Create NPI to ZIP mapping necessary for join
npi_zip_map <- filtered_data %>%
  mutate(NPI = as.character(NPI)) %>%
  select(NPI, Provider_Zip) %>%
  distinct(NPI, .keep_all = TRUE)

🔹 Step 5: Aggregate Cluster Data by ZIP code

zip_cluster_data <- results %>%
  mutate(NPI = as.character(NPI)) %>%
  left_join(npi_zip_map, by = "NPI") %>%
  group_by(Provider_Zip, Cluster_Label) %>%
  summarise(Count = n(), .groups = "drop")

🔹 Step 6: Join Cluster Aggregates to Geopgrahic Shapes

zip_cluster_sf <- left_join(zcta_shapes,
                            zip_cluster_data %>%
                              mutate(Provider_Zip = sprintf("%05s", as.character(Provider_Zip))),
                            by = c("Zip" = "Provider_Zip")) %>%
  filter(!is.na(Cluster_Label)) %>%
  st_make_valid() %>%
  filter(st_geometry_type(geometry) %in% c("POLYGON", "MULTIPOLYGON")) %>%
  st_transform(4326)

🔹 Step 7: Define Map Aesthetics

zip_cluster_sf <- zip_cluster_sf %>%
  mutate(
    fill_color = dplyr::recode(Cluster_Label,
      "Moderate Cost, High Volume" = "#1f78b4",
      "Typical Providers" = "#33a02c",
      "High Cost, Low Volume" = "#e31a1c",
      "Low Cost, Very High Volume" = "#ff7f00",
      .default = "#cccccc"
    ),
    opacity = scales::rescale(Count, to = c(0.4, 1))
  )

🔹 Step 8: Create and Display Interactive Leaflet Map

# Interactive Leaflet Map
leaflet_map <- leaflet::leaflet(zip_cluster_sf) %>%
  leaflet::addProviderTiles(leaflet::providers$CartoDB.Positron) %>%
  leaflet::addPolygons(
    fillColor = ~fill_color,
    fillOpacity = ~opacity,
    color = "#444444",
    weight = 0.5,
    popup = ~paste0(
      "<b>ZIP:</b> ", Zip, "<br>",
      "<b>Cluster:</b> ", Cluster_Label, "<br>",
      "<b>Provider Count:</b> ", Count
    )
  ) %>%
  leaflet::addLegend(
    "bottomright",
    colors = c("#1f78b4", "#33a02c", "#e31a1c", "#ff7f00"),
    labels = c(
      "Moderate Cost, High Volume",
      "Typical Providers",
      "High Cost, Low Volume",
      "Low Cost, Very High Volume"
    ),
    title = "Cluster Type",
    opacity = 1
  )

🔹 Step 9: Save and Output Map

htmlwidgets::saveWidget(leaflet_map, "Output/Medicare_Clusters_FL.html", selfcontained = TRUE)

leaflet_map

📄 Summary of K-Means Clustering Findings

According to the provider statistics, Cluster 2 is the most common, accounting for the majority of providers, while Cluster 3 is under-represented, implying potential clustering imbalances. Cluster 2 has a wide range of values for total services, average submitted charge, and average Medicare payment, with average Medicare payments at $70.23, average submitted charges at $356.84, and an average of 604 total services; however, Cluster 3 has higher costs and lower service volumes, with average Medicare payments at $425.36, average submitted charges at $4,490.86, and an average of 29 total services. These findings indicate that the clustering technique should be reconsidered, possibly using new algorithms or feature engineering, as well as a more rigorous outlier analysis within Cluster 2 to assess whether some providers should be reclassified in order to establish more distinct and balanced groups.

Linear Regression: Steven Barden, Scott Young

📘 Introduction

The primary goal of this methodology is to identify and understand factors influencing average Medicare payments.

🔹 Step 1: Prepare Data for Linear Mixed-Effects Model

# Load and clean with basic numeric conversion
df_lm <- filtered_data %>%
  select(
    Avg_Medicare_Payment,
    Total_Services,
    Avg_Submitted_Charge,
    Total_Beneficiaries,
    Place_Of_Service,
    Provider_Type,
    RUCA_Description
  ) %>%
  mutate(
    Medicare_Payment = parse_number(as.character(Avg_Medicare_Payment)),
    Services = parse_number(as.character(Total_Services)),
    Submitted_Charge = parse_number(as.character(Avg_Submitted_Charge)),
    Beneficiaries = parse_number(as.character(Total_Beneficiaries)),
    Place_Of_Service = factor(Place_Of_Service),
    Specialty = factor(Provider_Type),
    RUCA_Description = factor(RUCA_Description)
  ) %>%
  select(-Avg_Medicare_Payment, -Total_Services, -Avg_Submitted_Charge, -Total_Beneficiaries, -Provider_Type) %>%
  filter(Medicare_Payment > 1.00) %>%
  mutate(
    Log_Medicare_Payment = log(Medicare_Payment)
  ) %>%
  filter(is.finite(Log_Medicare_Payment)) %>%
  mutate(
    across(c(Services, Submitted_Charge, Beneficiaries),
           list(z = ~scale(.)),
           .names = "{.col}_z")
  ) %>%
  group_by(Specialty) %>%
  filter(n() >= 50) %>%
  ungroup %>%
  drop_na() %>%
  select(
    Log_Medicare_Payment,
    Services_z, Submitted_Charge_z, Beneficiaries_z,
    Place_Of_Service, RUCA_Description,
    Specialty
  )

🔹 Step 2: Fit Linear Mixed-Effects Model

# Run simplified regression (removed HCPCS_Description)
mixed_model <- lmer(
  Log_Medicare_Payment ~ Services_z + Submitted_Charge_z + Beneficiaries_z +
    Place_Of_Service + RUCA_Description +
    (1 | Specialty),
  data = df_lm,
)

🔹 Step 3: Raw Summary of the Model

# Raw Model Summary
summary(mixed_model)
## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## Log_Medicare_Payment ~ Services_z + Submitted_Charge_z + Beneficiaries_z +  
##     Place_Of_Service + RUCA_Description + (1 | Specialty)
##    Data: df_lm
## 
## REML criterion at convergence: 483082.8
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -14.3763  -0.4859   0.1073   0.6289   6.4610 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  Specialty (Intercept) 0.1946   0.4412  
##  Residual              1.0409   1.0203  
## Number of obs: 167701, groups:  Specialty, 76
## 
## Fixed effects:
##                                                                                                                       Estimate
## (Intercept)                                                                                                           4.174173
## Services_z                                                                                                           -0.021389
## Submitted_Charge_z                                                                                                    0.337635
## Beneficiaries_z                                                                                                      -0.002353
## Place_Of_ServiceO                                                                                                    -0.221650
## RUCA_DescriptionMetropolitan area high commuting: primary flow 30% or more to a urbanized area of 50,000 and greater -0.291418
## RUCA_DescriptionMetropolitan area low commuting: primary flow 10% to <30% to a urbanized area of 50,000 and greater   0.114430
## RUCA_DescriptionMicropolitan area core: primary flow within an urban cluster of 10,000 to 49,999                     -0.084599
## RUCA_DescriptionSecondary flow 30% to <50% to a larger urbanized area of 50,000 and greater                           0.047710
## RUCA_DescriptionSmall town core: primary flow within an urban cluster of 2,500 to 9,999                              -0.212993
##                                                                                                                      Std. Error
## (Intercept)                                                                                                            0.051182
## Services_z                                                                                                             0.003050
## Submitted_Charge_z                                                                                                     0.002778
## Beneficiaries_z                                                                                                        0.003073
## Place_Of_ServiceO                                                                                                      0.006479
## RUCA_DescriptionMetropolitan area high commuting: primary flow 30% or more to a urbanized area of 50,000 and greater   0.017488
## RUCA_DescriptionMetropolitan area low commuting: primary flow 10% to <30% to a urbanized area of 50,000 and greater    0.067628
## RUCA_DescriptionMicropolitan area core: primary flow within an urban cluster of 10,000 to 49,999                       0.087376
## RUCA_DescriptionSecondary flow 30% to <50% to a larger urbanized area of 50,000 and greater                            0.011288
## RUCA_DescriptionSmall town core: primary flow within an urban cluster of 2,500 to 9,999                                0.242755
##                                                                                                                      t value
## (Intercept)                                                                                                           81.556
## Services_z                                                                                                            -7.014
## Submitted_Charge_z                                                                                                   121.520
## Beneficiaries_z                                                                                                       -0.766
## Place_Of_ServiceO                                                                                                    -34.210
## RUCA_DescriptionMetropolitan area high commuting: primary flow 30% or more to a urbanized area of 50,000 and greater -16.664
## RUCA_DescriptionMetropolitan area low commuting: primary flow 10% to <30% to a urbanized area of 50,000 and greater    1.692
## RUCA_DescriptionMicropolitan area core: primary flow within an urban cluster of 10,000 to 49,999                      -0.968
## RUCA_DescriptionSecondary flow 30% to <50% to a larger urbanized area of 50,000 and greater                            4.227
## RUCA_DescriptionSmall town core: primary flow within an urban cluster of 2,500 to 9,999                               -0.877
## 
## Correlation of Fixed Effects:
##                   (Intr) Srvcs_ Sbm_C_ Bnfcr_ P_O_SO Rahcpf3omtauao5ag
## Services_z        -0.001                                              
## Sbmttd_Chr_       -0.009  0.002                                       
## Beneficrs_z        0.001 -0.572  0.001                                
## Plc_Of_SrvO       -0.085 -0.001  0.067 -0.001                         
## Rahcpf3omtauao5ag -0.002  0.000  0.000 -0.002 -0.031                  
## Ralcpf1t<tauao5ag -0.001  0.002 -0.002 -0.001 -0.009  0.007           
## Racpfwauco1t4     -0.002  0.001  0.000  0.000  0.000  0.005           
## Rf3t<taluao5ag    -0.007  0.000  0.000  0.001 -0.026  0.036           
## Rtcpfwauco2t9     -0.013  0.000  0.003  0.000 -0.008  0.001           
##                   Ralcpf1t<tauao5ag Racpfwauco1t4 Rf3t<taluao5ag
## Services_z                                                      
## Sbmttd_Chr_                                                     
## Beneficrs_z                                                     
## Plc_Of_SrvO                                                     
## Rahcpf3omtauao5ag                                               
## Ralcpf1t<tauao5ag                                               
## Racpfwauco1t4      0.002                                        
## Rf3t<taluao5ag     0.010             0.008                      
## Rtcpfwauco2t9      0.000             0.000         0.001
# Fixed Effects Summary
knitr::kable(tidy(mixed_model, effects = "fixed", conf.int = TRUE), digits =3)
effect term estimate std.error statistic conf.low conf.high
fixed (Intercept) 4.174 0.051 81.556 4.074 4.274
fixed Services_z -0.021 0.003 -7.014 -0.027 -0.015
fixed Submitted_Charge_z 0.338 0.003 121.520 0.332 0.343
fixed Beneficiaries_z -0.002 0.003 -0.766 -0.008 0.004
fixed Place_Of_ServiceO -0.222 0.006 -34.210 -0.234 -0.209
fixed RUCA_DescriptionMetropolitan area high commuting: primary flow 30% or more to a urbanized area of 50,000 and greater -0.291 0.017 -16.664 -0.326 -0.257
fixed RUCA_DescriptionMetropolitan area low commuting: primary flow 10% to <30% to a urbanized area of 50,000 and greater 0.114 0.068 1.692 -0.018 0.247
fixed RUCA_DescriptionMicropolitan area core: primary flow within an urban cluster of 10,000 to 49,999 -0.085 0.087 -0.968 -0.256 0.087
fixed RUCA_DescriptionSecondary flow 30% to <50% to a larger urbanized area of 50,000 and greater 0.048 0.011 4.227 0.026 0.070
fixed RUCA_DescriptionSmall town core: primary flow within an urban cluster of 2,500 to 9,999 -0.213 0.243 -0.877 -0.689 0.263
# Random Effects Summary
knitr::kable(tidy(mixed_model, effects = "ran_pars", scales = "vcov"), digits = 3)
effect group term estimate
ran_pars Specialty var__(Intercept) 0.195
ran_pars Residual var__Observation 1.041

🔹 Step 4: Analyze and Plot Key Fixed Effects

# Key fixed effect drivers
fixed_effects <- tidy(mixed_model,
                      effects = "fixed",
                      conf.int = TRUE)

plot_data_fixed <- fixed_effects %>%
    filter(term %in% c("Submitted_Charge_z", "Place_Of_ServiceO")) %>%
  mutate(
    # Create clearer names for the plot axis
    term_cleaned = case_when(
      term == "Submitted_Charge_z" ~ "Submitted Charge Impact",
      term == "Place_Of_ServiceO" ~ "Office vs. Facility Impact",
      TRUE ~ term
    )
  )

plot_key_drivers <- ggplot(plot_data_fixed, aes(x = estimate, y = reorder(term_cleaned, estimate))) +
    geom_vline(xintercept = 0, linetype = "dashed", color = "grey70") +
    geom_errorbarh(aes(xmin = conf.low, xmax = conf.high), height = 0.15, linewidth = 0.7) +
    geom_point(size = 3.5, color = "dodgerblue") +
    labs(
      title = "Key Factors Influencing Average Medicare Payment",
      subtitle = "Estimated Impact (Positive or Negative)",
      x = "Estimated Impact on Medicare Payment",
      y = "Factor"
    ) +
    theme_minimal(base_size = 14) +
    theme(panel.grid.major.y = element_blank())

ggsave("Output/plot_key_drivers_opaque.png",
       plot = plot_key_drivers,
       width = 8,
       height = 4,
       bg = "white"
)
ggsave("Output/plot_key_drivers_transparent.png",
       plot = plot_key_drivers,
       width = 8,
       height = 4
)
# Display the plot
plot_key_drivers

🔹 Step 5: Analyze and Plot Random Effects

# Variation between specialties
specialty_effects <- ranef(mixed_model,
                           condVar = FALSE)$Specialty %>%
  rename(Intercept_Deviation = `(Intercept)`) %>%
  tibble::rownames_to_column("Specialty")

plot_specialty_variation <- ggplot(specialty_effects, aes(x = Intercept_Deviation, y = reorder(Specialty, Intercept_Deviation))) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "grey70") +
  geom_point(size = 2, color = "forestgreen") +
  labs(
    title = "Variation in Baseline Payments Across Specialties",
    subtitle = "Deviation from Average Baseline Medicare Payment",
    x = "Estimated Deviation from Average Baseline (Medicare Payment)",
    y = "Medical Specialty"
  ) +
  theme_minimal(base_size = 14) +
  theme(
     axis.text.y = element_text(size = 8),
     panel.grid.major.y = element_blank()
  )

ggsave(
    "Output/plot_specialty_variation_opaque.png",
    plot = plot_specialty_variation,
    width = 12,
    height = 15,
    units = "in",
    dpi = 300,
    bg = "white"
)
ggsave(
    "Output/plot_specialty_variation_transparent.png",
    plot = plot_specialty_variation,
    width = 12,
    height = 15,
    units = "in",
    dpi = 300
)
# Display the plot
plot_specialty_variation

📄 Summary of Linear Regression Findings

After analyzing 170,000 Medicare claims across the Tampa Bay area, this statistical model identifies key payment drivers. It uses a linear mixed-effects methodology that is a subset of linear regression to find differences between 76 different provider types while accounting for average charge amounts.

Heat Map Visual: Khushi Patel, Naomi Vaid

📘 Introduction

The primary goal of the heat map visualization was to see at a glance average Medicare costs in the Tampa Bay region.

🔹 Step 1: Define Input File Path for Shapefiles

# File paths
shapefile_path <- "Data/tl_2010_12_zcta510.shp"

🔹 Step 2: Load and Prepare Medicare Data for Visualization

# Load Medicare data
medicare_cleaned <- filtered_data %>%
  mutate(
    Provider_Zip = str_pad(substr(as.character(Provider_Zip), 1, 5), 5, pad = "0"),
    Avg_Medicare_Payment = as.numeric(gsub("[$,]", "", Avg_Medicare_Payment))
  )

🔹 Step 3: Load Geographic Data Shapefile

# Load shapefile
zcta <- st_read(shapefile_path) %>%
  mutate(ZCTA5CE10 = as.character(ZCTA5CE10))
## Reading layer `tl_2010_12_zcta510' from data source 
##   `C:\Users\scott\Documents\School\LIS4761\Final\LIS4761_Final\CMS Final Project\Data\tl_2010_12_zcta510.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 983 features and 11 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -87.62589 ymin: 24.51748 xmax: -80.02711 ymax: 31.00097
## Geodetic CRS:  NAD83

🔹 Step 4: Set Analysis Parameters

# Choose specialty
selected_specialty <- "Internal Medicine"

🔹 Step 5: Filter and Summarize Data by Specialty and ZIP

# Filter and summarize
hm_filtered_data <- medicare_cleaned %>%
  filter(Provider_Type == selected_specialty) %>%
  group_by(Provider_Zip) %>%
  summarize(
    Avg_Payment = mean(Avg_Medicare_Payment, na.rm = TRUE),
    Providers = n(),
    .groups = "drop"
  )

🔹 Step 6: Merge Geographic and Summarized Medicare Data

# Merge with shapefile
merged_data <- zcta %>%
  left_join(hm_filtered_data, by = c("ZCTA5CE10" = "Provider_Zip"))

🔹 Step 7: Filter Merged Data to Tampa Bay Region

# Filter with Tampa Bay ZIP codes
tampa_map_data <- merged_data %>%
  filter(ZCTA5CE10 %in% tampa_bay_zips)

🔹 Step 8 Create Map Plot

# Create heat map
tampa_plot <- ggplot(data = tampa_map_data) +
  geom_sf(aes(fill = Avg_Payment), color = "white", size = 0.2) +
  scale_fill_viridis_c(
    option = "plasma",
    na.value = "grey90",
    name = "Avg Payment ($)",
    limits = c(0, 150),
    breaks = c(0, 50, 100, 150),
    oob = scales::squish
  ) +
  labs(
    title = "Medicare Average Payment by ZIP Code in Tampa Bay",
    subtitle = paste("Specialty:", selected_specialty),
  ) +
  theme_minimal(base_size = 14) +
  theme(
    legend.position = "right",
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(size = 13),
    plot.caption = element_text(size = 10, face = "italic")
  )

# Save the heat map to the output folder
ggsave(
  filename = "Output/heatmap_transparent.png",
  plot = tampa_plot,
  width = 10,
  height = 6,
  dpi = 300
)

ggsave(
  filename = "Output/heatmap_opaque.png",
  plot = tampa_plot,
  width = 10,
  height = 6,
  dpi = 300,
  bg = "white"
)

# Display the plot
tampa_plot

📄 Summary of Heat Map Visualization Findings

Here is a summary of our analysis based on Medicare payment data visualization:

We analyzed average Medicare payments for providers specializing in Internal Medicine across Tampa Bay ZIP codes. This process began by loading and preparing a cleaned Medicare dataset and a shapefile representing Florida ZIP Code Tabulation Areas (ZCTAs).

After filtering the data for the selected specialty (“Internal Medicine”), we calculated the average Medicare payment and the number of providers per ZIP code. This summarized dataset was then merged with the shapefile data to enable geographic plotting.

Using ggplot2 and the sf package, we created a choropleth map showing the distribution of average Medicare payments across Tampa Bay. Areas with higher average payments appear in brighter plasma shades, while areas with lower payments or no data are shown in muted tones.

The heatmap helps reveal regional disparities in Medicare reimbursements, highlighting ZIP codes with notably higher or lower average payments. These visual insights can inform further investigation into geographic trends in healthcare spending, potential inequities in reimbursement rates, or gaps in provider coverage for Internal Medicine in Tampa Bay.